package com.example.spider.provider.why10w.bmcx;

import cn.hutool.core.util.ReUtil;
import com.example.spider.provider.why10w.AbstractWhy10wDetailResultHandler;
import com.example.spider.provider.why10w.dto.Why10wDetailTask;
import com.example.spider.provider.why10w.dto.WhyTaskResult;
import lombok.extern.slf4j.Slf4j;
import org.springframework.util.CollectionUtils;

import java.util.Collections;
import java.util.List;

/**
 * Handles the result of a bmcx detail-page fetch.
 * <p>
 * The site rate-limits heavily and will ban the crawler's IP for a short time.
 *
 * @author lym
 */
@Slf4j
//@Component
public class BmcxDetailResultHandler extends AbstractWhy10wDetailResultHandler {

    /** URL prefix removed from the task URL when deriving the result id. */
    private static final String URL_PREFIX = "https://swgwsm.bmcx.com/";

    /** Captures the text between the {@code neirong} div's opening tag and the first closing div tag. */
    private static final String CONTENT_REGEX = "<div class=\"neirong\">(.*?)</div>";

    /**
     * Matches the traffic-driving span injected into the article body.
     * {@code (?s)} lets {@code .} cross line breaks so a span that wraps onto
     * several lines is removed as well.
     */
    private static final String PROMO_SPAN_REGEX = "(?s)<span class=\"charu_yc_url\">.*?</span>";

    /**
     * Matches whitespace sitting between two Chinese characters. The right-hand
     * character is a lookahead (not consumed) so that it can also act as the
     * left-hand character of the next match; otherwise only every other gap in
     * a run such as "A B C" would be removed.
     */
    private static final String CJK_GAP_REGEX = "([\\u4e00-\\u9fa5])\\s+(?=[\\u4e00-\\u9fa5])";

    /**
     * Extracts the article content from a detail page and stores it.
     * <p>
     * If the page has no {@code neirong} div, a warning is logged and nothing is
     * stored. Otherwise the cleaned content is added to the repository as a
     * {@link WhyTaskResult} and the raw HTML is saved afterwards.
     *
     * @param task the detail task that produced the page; supplies title, URL and source
     * @param html the raw HTML of the detail page
     */
    @Override
    public void doHandle(Why10wDetailTask task, String html) {
        String url = task.getUrl();

        // The page is large; cut out only the expected section instead of parsing the whole HTML.
        List<String> contentMatchResult = ReUtil.findAllGroup1(CONTENT_REGEX, html);
        if (CollectionUtils.isEmpty(contentMatchResult)) {
            log.warn("illegal BmcxDetail result {}", url);
            return;
        }

        WhyTaskResult result = WhyTaskResult.builder()
                .id(extractId(url))
                .title(task.getTitle())
                .url(url)
                .categoryList(Collections.emptyList())
                .tagList(Collections.emptyList())
                .imageInfoList(Collections.emptyList())
                .content(cleanContent(contentMatchResult.get(0)))
                .source(task.getSource())
                .build();

        repository.addResult(result);

        repository.saveHtml(task, html);
    }

    /** Strips the promotional span and the whitespace between Chinese characters from the content. */
    private static String cleanContent(String rawContent) {
        String withoutPromo = rawContent.replaceAll(PROMO_SPAN_REGEX, "");
        return ReUtil.replaceAll(withoutPromo, CJK_GAP_REGEX, "$1");
    }

    /** Derives the result id from the URL by dropping the site prefix and every slash. */
    private static String extractId(String url) {
        return url.replace(URL_PREFIX, "").replace("/", "");
    }
}
